This file contains an example of tuning a Logistic Regression model with BayesSearchCV
import pickle
import time
import helpsk as hlp
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
import plotly.io as pio
# Render plotly figures inline in the notebook.
pio.renderers.default = 'notebook'

# Load the training features and labels pickled by an upstream step.
with open('../X_train.pkl', 'rb') as f:
    X_train = pickle.load(f)
with open('../y_train.pkl', 'rb') as f:
    y_train = pickle.load(f)
# Profile the numeric columns (null/zero counts, moments, percentiles);
# return_style=True renders a styled table in the notebook.
hlp.pandas.numeric_summary(X_train, return_style=True)
| # of Non-Nulls | # of Nulls | % Nulls | # of Zeros | % Zeros | Mean | St Dev. | Coef of Var | Skewness | Kurtosis | Min | 10% | 25% | 50% | 75% | 90% | Max | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| duration | 760 | 40 | 5.0% | 0 | 0.0% | 21.0 | 11.7 | 0.6 | 1.0 | 0.6 | 4.0 | 9.0 | 12.0 | 18.0 | 24.0 | 36.0 | 60.0 |
| credit_amount | 800 | 0 | 0.0% | 38 | 5.0% | 3,203.9 | 2,932.3 | 0.9 | 1.9 | 3.9 | 0.0 | 753.9 | 1,300.8 | 2,236.5 | 3,951.5 | 7,394.6 | 18,424.0 |
| installment_commitment | 800 | 0 | 0.0% | 0 | 0.0% | 3.0 | 1.1 | 0.4 | -0.5 | -1.2 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 |
| residence_since | 800 | 0 | 0.0% | 0 | 0.0% | 2.9 | 1.1 | 0.4 | -0.3 | -1.4 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 |
| age | 800 | 0 | 0.0% | 0 | 0.0% | 35.6 | 11.4 | 0.3 | 1.0 | 0.7 | 19.0 | 23.0 | 27.0 | 33.0 | 42.0 | 52.0 | 75.0 |
| existing_credits | 800 | 0 | 0.0% | 0 | 0.0% | 1.4 | 0.6 | 0.4 | 1.3 | 1.6 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 | 4.0 |
| num_dependents | 800 | 0 | 0.0% | 0 | 0.0% | 1.1 | 0.3 | 0.3 | 2.0 | 2.1 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 |
# Profile the categorical columns (null counts, most frequent value, cardinality).
hlp.pandas.non_numeric_summary(X_train, return_style=True)
| # of Non-Nulls | # of Nulls | % Nulls | Most Freq. Value | # of Unique | % Unique | |
|---|---|---|---|---|---|---|
| checking_status | 763 | 37 | 4.6% | no checking | 4 | 0.5% |
| credit_history | 800 | 0 | 0.0% | existing paid | 5 | 0.6% |
| purpose | 800 | 0 | 0.0% | radio/tv | 10 | 1.2% |
| savings_status | 800 | 0 | 0.0% | <100 | 5 | 0.6% |
| employment | 800 | 0 | 0.0% | 1<=X<4 | 5 | 0.6% |
| personal_status | 800 | 0 | 0.0% | male single | 4 | 0.5% |
| other_parties | 800 | 0 | 0.0% | none | 3 | 0.4% |
| property_magnitude | 800 | 0 | 0.0% | car | 4 | 0.5% |
| other_payment_plans | 800 | 0 | 0.0% | none | 3 | 0.4% |
| housing | 800 | 0 | 0.0% | own | 3 | 0.4% |
| job | 800 | 0 | 0.0% | skilled | 4 | 0.5% |
| own_telephone | 800 | 0 | 0.0% | none | 2 | 0.2% |
| foreign_worker | 800 | 0 | 0.0% | yes | 2 | 0.2% |
# First ten training labels (binary target; encoding semantics not shown in
# this file — TODO confirm which class 1 represents).
y_train[0:10]
array([1, 1, 0, 1, 0, 1, 0, 1, 1, 0])
# Class counts: the target is imbalanced (559 vs 241).
np.unique(y_train, return_counts=True)
(array([0, 1]), array([559, 241]))
# Class proportions: roughly 70% / 30%.
np.unique(y_train, return_counts=True)[1] / np.sum(np.unique(y_train, return_counts=True)[1])
array([0.69875, 0.30125])
from sklearn.preprocessing import OrdinalEncoder
# Quick demo of what ordinal encoding produces on two categorical columns
# (one integer code per category level); exploratory only — the tuned
# pipeline below chooses between OneHotEncoder and CustomOrdinalEncoder.
OrdinalEncoder().fit_transform(X_train[['purpose', 'savings_status']])
array([[0., 2.],
[2., 2.],
[9., 1.],
...,
[9., 3.],
[6., 4.],
[6., 2.]])
# Split the column names by dtype; these lists route features through the
# ColumnTransformer built below.
numeric_columns = hlp.pandas.get_numeric_columns(X_train)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(X_train)
print(numeric_columns)
print(non_numeric_columns)
['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents'] ['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']
# Numeric features: tunable imputation followed by a tunable scaler.
# TransformerChooser acts as a pass-through when no transformer is selected,
# so running this pipeline outside of tuning applies no transformation.
numeric_pipeline = Pipeline(steps=[
    ('imputer', hlp.sklearn_pipeline.TransformerChooser()),
    ('scaler', hlp.sklearn_pipeline.TransformerChooser()),
])

# Non-numeric features: tunable categorical encoder.
non_numeric_pipeline = Pipeline(steps=[
    ('encoder', hlp.sklearn_pipeline.TransformerChooser()),
])

from sklearn.compose import ColumnTransformer

# Route each column subset through its corresponding sub-pipeline.
transformations_pipeline = ColumnTransformer(transformers=[
    ('numeric', numeric_pipeline, numeric_columns),
    ('non_numeric', non_numeric_pipeline, non_numeric_columns),
])
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import KernelPCA

# Base estimator; C is tuned via the search space defined below.
logistic_model = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)

# Preprocessing and model combined into one estimator, so transformers are
# fit on each CV training fold only (no leakage into validation folds).
full_pipeline = Pipeline(steps=[
    ('prep', transformations_pipeline),
    # Optional PCA step, currently disabled (see commented search-space entries):
    # ('pca', KernelPCA()),
    # ('pca', hlp.sklearn_pipeline.TransformerChooser()),
    ('model', logistic_model),
])
# Inspect the nested structure: 'prep' (ColumnTransformer wrapping the numeric
# and non-numeric sub-pipelines) and 'model' (the LogisticRegression step).
full_pipeline.named_steps
{'prep': ColumnTransformer(transformers=[('numeric',
Pipeline(steps=[('imputer',
TransformerChooser()),
('scaler',
TransformerChooser())]),
['duration', 'credit_amount',
'installment_commitment', 'residence_since',
'age', 'existing_credits',
'num_dependents']),
('non_numeric',
Pipeline(steps=[('encoder',
TransformerChooser())]),
['checking_status', 'credit_history',
'purpose', 'savings_status', 'employment',
'personal_status', 'other_parties',
'property_magnitude', 'other_payment_plans',
'housing', 'job', 'own_telephone',
'foreign_worker'])]),
'model': LogisticRegression(max_iter=1000, random_state=42)}
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score  # , roc_auc_score
from sklearn.metrics import get_scorer

# Named scorers for evaluating search results.
# NOTE: `sklearn.metrics.SCORERS` was deprecated in scikit-learn 1.0 and removed
# in 1.3; `get_scorer(name)` is the supported way to look up a built-in scorer.
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
# https://stackoverflow.com/questions/60615281/different-result-roc-auc-score-and-plot-roc-curve
scores = {
    'ROC/AUC': get_scorer('roc_auc'),
    'F1': make_scorer(f1_score, greater_is_better=True),
    'Pos. Pred. Val': make_scorer(precision_score, greater_is_better=True),
    'True Pos. Rate': make_scorer(recall_score, greater_is_better=True),
}

# Cross-validation configuration, shared with the search below.
num_folds = 5
num_repeats = 2
# pip install scikit-optimize
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.model_selection import RepeatedStratifiedKFold

# Hyper-parameter search space. The TransformerChooser steps let the search
# select among preprocessing alternatives (None means "skip this step").
search_space = {
    'prep__numeric__imputer__transformer': Categorical([SimpleImputer(strategy='mean')]),
    'prep__numeric__scaler__transformer': Categorical([
        None,
        MinMaxScaler(),
        StandardScaler()
    ]),
    'prep__non_numeric__encoder__transformer': Categorical([
        OneHotEncoder(),
        hlp.sklearn_pipeline.CustomOrdinalEncoder()
    ]),
    # 'pca__transformer': Categorical([
    #     None,
    #     KernelPCA(n_components=5, kernel='rbf'),
    #     KernelPCA(n_components=5, kernel='sigmoid'),
    #     KernelPCA(n_components=5, kernel='linear'),
    # ]),
    # 'pca__n_components': Integer(3, X_train.shape[1]),
    # 'pca__gamma': Real(0.03, 0.05),
    # 'pca__kernel': Categorical(['rbf', 'sigmoid']),
    # C spans four orders of magnitude, so sample it on a log scale; a uniform
    # prior would almost never propose values below 1.
    'model__C': Real(0.01, 100, prior='log-uniform')
}
bayes_search = BayesSearchCV(
    estimator=full_pipeline,
    search_spaces=search_space,
    n_iter=40,
    # The target classes are imbalanced (~70%/30%), so stratify the folds;
    # fixing random_state makes the fold assignment reproducible across runs.
    cv=RepeatedStratifiedKFold(n_splits=num_folds, n_repeats=num_repeats, random_state=42),
    scoring='roc_auc',
    #return_train_score=True,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
start_time = time.time()
bayes_search.fit(X_train, y_train)
elapsed_time = time.time() - start_time
del search_space
Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits
/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.
Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits
/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.
Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits
/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.
Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits
/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.
Fitting 10 folds for each of 1 candidates, totalling 10 fits
/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.
Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits
/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.
Fitting 10 folds for each of 1 candidates, totalling 10 fits
/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.
Fitting 10 folds for each of 1 candidates, totalling 10 fits Fitting 10 folds for each of 1 candidates, totalling 10 fits
/Users/shanekercheval/opt/anaconda3/envs/python-examples/lib/python3.9/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.
Fitting 10 folds for each of 1 candidates, totalling 10 fits
# Report wall-clock time for the whole search (n_iter=40 candidates, each
# evaluated with 5x2 = 10 CV fits).
print(f"Elapsed time to run BayesSearchCV: {elapsed_time:.3f} seconds; {elapsed_time / 60:.1f} minutes")
Elapsed time to run BayesSearchCV: 31.967 seconds; 0.5 minutes
# Full cross-validation results: fit/score timings, sampled parameters,
# per-fold test scores, means/stds, and ranks for each of the 40 candidates.
print(bayes_search.cv_results_)
{'mean_fit_time': array([0.05778637, 0.04239061, 0.05243752, 0.03702188, 0.03293884,
0.07713568, 0.03653631, 0.03425539, 0.04791358, 0.03773611,
0.04417167, 0.01475456, 0.01055415, 0.03753324, 0.02018332,
0.09075487, 0.01683578, 0.0307265 , 0.10254204, 0.01785464,
0.02788417, 0.03260612, 0.01231656, 0.01688731, 0.03496861,
0.01382778, 0.09754269, 0.01712813, 0.01210411, 0.01715906,
0.02718132, 0.0123688 , 0.07041507, 0.01668544, 0.01391535,
0.02891979, 0.01269498, 0.01185124, 0.03138831, 0.01117296]), 'std_fit_time': array([0.00648998, 0.00708358, 0.01155594, 0.00875364, 0.0093445 ,
0.01996607, 0.00823643, 0.00402763, 0.00971215, 0.0056371 ,
0.01450828, 0.00416567, 0.00117038, 0.010382 , 0.00763564,
0.04964332, 0.00324008, 0.01043235, 0.03642722, 0.00533483,
0.00735898, 0.01829786, 0.00082589, 0.00500761, 0.00630785,
0.00278647, 0.03462432, 0.0048105 , 0.00149949, 0.00735189,
0.00560827, 0.001987 , 0.01660823, 0.00425347, 0.00382317,
0.00470269, 0.00240134, 0.00203203, 0.00824969, 0.00153924]), 'mean_score_time': array([0.01896415, 0.01487114, 0.01288242, 0.00624564, 0.00581465,
0.01134057, 0.01480517, 0.01530533, 0.01046803, 0.01585138,
0.00614612, 0.00508809, 0.00379152, 0.00583932, 0.00428429,
0.00447242, 0.00448968, 0.00648947, 0.00592687, 0.00460117,
0.00385704, 0.00660472, 0.00373719, 0.0063494 , 0.00660176,
0.00656879, 0.00474763, 0.00696344, 0.00379338, 0.0054677 ,
0.00423253, 0.00645633, 0.01490698, 0.00607178, 0.00846229,
0.00439963, 0.00499566, 0.00706275, 0.01406505, 0.00422969]), 'std_score_time': array([6.91529206e-03, 3.50741869e-03, 2.63529334e-03, 2.58575311e-03,
2.35387174e-03, 3.17664916e-03, 5.39983974e-03, 5.74045638e-03,
2.05925071e-03, 5.41276768e-03, 2.89372902e-03, 1.79577083e-03,
9.46059102e-05, 1.55295710e-03, 1.64424139e-03, 8.97764620e-04,
5.16806656e-04, 2.65436723e-03, 3.75296715e-03, 3.54973400e-04,
5.38622556e-05, 2.84415669e-03, 1.36993434e-04, 1.83572476e-03,
3.76529121e-03, 3.63151950e-03, 1.73952408e-03, 2.72217554e-03,
1.31258131e-04, 1.92932098e-03, 5.79434596e-04, 3.07548225e-03,
4.48215177e-03, 1.35678267e-03, 4.97355218e-03, 9.82885323e-04,
1.76324916e-03, 3.50563984e-03, 5.63402456e-03, 1.07431411e-03]), 'param_model__C': masked_array(data=[41.01629484574285, 83.7404616717729, 44.48880290499217,
81.24147487585276, 79.95734862630124,
73.40546151837698, 61.711765056646605,
54.34487273835435, 95.54824385112973,
0.3729944224017752, 0.10782142349722036, 0.01,
0.011789053213525975, 15.04196091869179, 0.01,
99.93559191507566, 0.1618231487614719,
99.96376231964459, 99.94408396465242,
0.13360590518867257, 99.92063580907275,
0.013014924253593022, 0.09636808784098591, 0.01,
42.85344118168013, 0.01869204399018796,
99.95908273992939, 0.01, 0.09120539568143743,
0.04099894126722584, 42.97404946079402, 0.01,
0.09674632014186073, 0.01, 0.01, 99.90278398264437,
0.01, 0.01, 0.06743556144613103, 0.01],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False],
fill_value='?',
dtype=object), 'param_prep__non_numeric__encoder__transformer': masked_array(data=[CustomOrdinalEncoder(), CustomOrdinalEncoder(),
CustomOrdinalEncoder(), OneHotEncoder(),
OneHotEncoder(), CustomOrdinalEncoder(),
CustomOrdinalEncoder(), CustomOrdinalEncoder(),
CustomOrdinalEncoder(), CustomOrdinalEncoder(),
OneHotEncoder(), OneHotEncoder(), OneHotEncoder(),
OneHotEncoder(), OneHotEncoder(), OneHotEncoder(),
OneHotEncoder(), OneHotEncoder(), OneHotEncoder(),
OneHotEncoder(), OneHotEncoder(), OneHotEncoder(),
OneHotEncoder(), OneHotEncoder(), OneHotEncoder(),
OneHotEncoder(), OneHotEncoder(), OneHotEncoder(),
OneHotEncoder(), OneHotEncoder(), OneHotEncoder(),
OneHotEncoder(), CustomOrdinalEncoder(),
OneHotEncoder(), OneHotEncoder(), OneHotEncoder(),
OneHotEncoder(), OneHotEncoder(),
CustomOrdinalEncoder(), OneHotEncoder()],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False],
fill_value='?',
dtype=object), 'param_prep__numeric__imputer__transformer': masked_array(data=[SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer(), SimpleImputer(), SimpleImputer(),
SimpleImputer()],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False],
fill_value='?',
dtype=object), 'param_prep__numeric__scaler__transformer': masked_array(data=[MinMaxScaler(), StandardScaler(), MinMaxScaler(),
StandardScaler(), MinMaxScaler(), None,
StandardScaler(), StandardScaler(), MinMaxScaler(),
MinMaxScaler(), None, StandardScaler(),
StandardScaler(), StandardScaler(), None, None,
StandardScaler(), StandardScaler(), None,
StandardScaler(), StandardScaler(), None,
StandardScaler(), StandardScaler(), StandardScaler(),
MinMaxScaler(), None, StandardScaler(), MinMaxScaler(),
MinMaxScaler(), MinMaxScaler(), StandardScaler(), None,
StandardScaler(), StandardScaler(), MinMaxScaler(),
StandardScaler(), StandardScaler(), StandardScaler(),
StandardScaler()],
mask=[False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False],
fill_value='?',
dtype=object), 'params': [OrderedDict([('model__C', 41.01629484574285), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__C', 83.7404616717729), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__C', 44.48880290499217), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__C', 81.24147487585276), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__C', 79.95734862630124), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__C', 73.40546151837698), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__C', 61.711765056646605), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__C', 54.34487273835435), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__C', 95.54824385112973), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), 
('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__C', 0.3729944224017752), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__C', 0.10782142349722036), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__C', 0.01), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__C', 0.011789053213525975), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__C', 15.04196091869179), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__C', 0.01), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__C', 99.93559191507566), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__C', 0.1618231487614719), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__C', 99.96376231964459), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), 
('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__C', 99.94408396465242), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__C', 0.13360590518867257), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__C', 99.92063580907275), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__C', 0.013014924253593022), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__C', 0.09636808784098591), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__C', 0.01), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__C', 42.85344118168013), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__C', 0.01869204399018796), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__C', 99.95908273992939), ('prep__non_numeric__encoder__transformer', 
OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__C', 0.01), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__C', 0.09120539568143743), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__C', 0.04099894126722584), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__C', 42.97404946079402), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__C', 0.01), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__C', 0.09674632014186073), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', None)]), OrderedDict([('model__C', 0.01), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__C', 0.01), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__C', 99.90278398264437), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), 
('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', MinMaxScaler())]), OrderedDict([('model__C', 0.01), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__C', 0.01), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__C', 0.06743556144613103), ('prep__non_numeric__encoder__transformer', CustomOrdinalEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())]), OrderedDict([('model__C', 0.01), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())])], 'split0_test_score': array([0.68506436, 0.71229524, 0.70036364, 0.80466301, 0.72218182,
0.70637985, 0.80750605, 0.74235893, 0.72690909, 0.69715272,
0.72790846, 0.7684052 , 0.80003975, 0.72234935, 0.74089973,
0.75729469, 0.79003493, 0.74844662, 0.80642633, 0.73484397,
0.64603106, 0.78251484, 0.77894935, 0.76584758, 0.70530627,
0.76565295, 0.76353276, 0.71391369, 0.7422331 , 0.79780536,
0.77038961, 0.72509885, 0.74233766, 0.74001193, 0.70628019,
0.71011396, 0.776 , 0.71866097, 0.7302813 , 0.75480519]), 'split1_test_score': array([0.68647505, 0.77275234, 0.79555556, 0.72483092, 0.72311254,
0.74441964, 0.82054545, 0.68200161, 0.68791667, 0.73305381,
0.73636364, 0.71854545, 0.68069264, 0.75497835, 0.77539257,
0.76116071, 0.76563636, 0.77860502, 0.79519451, 0.77653263,
0.80597527, 0.70436364, 0.79279279, 0.79961173, 0.77583333,
0.72536688, 0.75509091, 0.80327266, 0.74915483, 0.77008929,
0.76811594, 0.79594618, 0.73592373, 0.81867822, 0.80262373,
0.831045 , 0.76563636, 0.71943372, 0.73275362, 0.87444196]), 'split2_test_score': array([0.75753835, 0.68954996, 0.69236364, 0.7507686 , 0.71759515,
0.69211779, 0.67411255, 0.74609375, 0.70807978, 0.71468877,
0.81393639, 0.79006085, 0.7582299 , 0.80103359, 0.79104167,
0.80022321, 0.7230371 , 0.77744304, 0.764 , 0.74056604,
0.75895876, 0.68108974, 0.80842067, 0.78490909, 0.74671704,
0.76674107, 0.74213075, 0.77256849, 0.74203472, 0.70647343,
0.72358631, 0.79465989, 0.66516458, 0.72953869, 0.77650648,
0.78418803, 0.78158539, 0.79003493, 0.74222448, 0.70263533]), 'split3_test_score': array([0.71786834, 0.70366133, 0.74441964, 0.78963563, 0.77259887,
0.73854545, 0.72708333, 0.77695742, 0.85621663, 0.73344494,
0.79996402, 0.73123487, 0.75635474, 0.80841878, 0.66631054,
0.69248575, 0.77183788, 0.73468661, 0.72830918, 0.80625437,
0.7342615 , 0.80302127, 0.72609819, 0.74737692, 0.75822762,
0.79910714, 0.79542278, 0.74516369, 0.82202611, 0.72441621,
0.71400966, 0.71510297, 0.73217391, 0.75875 , 0.73682317,
0.70833333, 0.73254545, 0.84090909, 0.73009673, 0.68464193]), 'split4_test_score': array([0.7836497 , 0.75595133, 0.71168155, 0.70404272, 0.81491815,
0.72186924, 0.67348008, 0.71343767, 0.69798531, 0.76264881,
0.71064815, 0.79231975, 0.82801932, 0.67143402, 0.76127273,
0.7624628 , 0.774 , 0.76722432, 0.69900412, 0.80991007,
0.75051779, 0.71478261, 0.79157115, 0.70347374, 0.7703622 ,
0.77101935, 0.71476844, 0.77687818, 0.79018182, 0.82414429,
0.84409716, 0.76246439, 0.72597254, 0.73490427, 0.77489698,
0.68932618, 0.75117555, 0.72596844, 0.73457676, 0.68526786]), 'split5_test_score': array([0.64098532, 0.71388216, 0.70558039, 0.82849072, 0.77199354,
0.7381202 , 0.73884391, 0.72349352, 0.68211068, 0.74981818,
0.79420759, 0.67765016, 0.82031405, 0.77345455, 0.70804729,
0.77157738, 0.764 , 0.7563285 , 0.75710145, 0.77583333,
0.7112069 , 0.6975 , 0.765016 , 0.733791 , 0.77179177,
0.80737705, 0.74537037, 0.77043478, 0.8022262 , 0.78215067,
0.78610474, 0.71151515, 0.73020631, 0.73322302, 0.78985108,
0.72618182, 0.72566372, 0.76655983, 0.7003632 , 0.79250612]), 'split6_test_score': array([0.77326468, 0.72903605, 0.73623534, 0.73658445, 0.7275466 ,
0.72571628, 0.67227053, 0.71751014, 0.731779 , 0.70246686,
0.735 , 0.816875 , 0.75613527, 0.76798524, 0.70469912,
0.69905956, 0.78449361, 0.78010664, 0.69442912, 0.7675841 ,
0.79775694, 0.73865327, 0.79254545, 0.75174703, 0.758 ,
0.82532051, 0.77043478, 0.74290249, 0.73150236, 0.71652174,
0.69777778, 0.83829138, 0.71888533, 0.81618182, 0.78102052,
0.81021767, 0.79095422, 0.81936813, 0.79245714, 0.70251082]), 'split7_test_score': array([0.76576916, 0.75762777, 0.74239351, 0.6678973 , 0.70711211,
0.72351421, 0.75688244, 0.71521577, 0.75955564, 0.6938244 ,
0.74527273, 0.72163636, 0.7326473 , 0.68887363, 0.74994876,
0.76442308, 0.74110337, 0.73165954, 0.81458333, 0.7148847 ,
0.7865781 , 0.75565361, 0.75283324, 0.77678571, 0.84599034,
0.72354167, 0.71400966, 0.79433333, 0.83140283, 0.79812466,
0.76048818, 0.73376012, 0.68872955, 0.74137931, 0.76802862,
0.77755376, 0.79690909, 0.84662711, 0.70362199, 0.79507056]), 'split8_test_score': array([0.78645833, 0.75499191, 0.701373 , 0.64090909, 0.74706183,
0.76426285, 0.74497768, 0.74275949, 0.75564972, 0.70642796,
0.79842033, 0.79489867, 0.73913818, 0.76476812, 0.74057731,
0.80892449, 0.78233495, 0.81603423, 0.76699863, 0.75277725,
0.70033482, 0.7514245 , 0.7831445 , 0.7456377 , 0.74859903,
0.76144423, 0.8043758 , 0.74083492, 0.68866667, 0.75157402,
0.75285024, 0.73748991, 0.78472727, 0.75050561, 0.7077753 ,
0.79069368, 0.76909091, 0.68508185, 0.76265209, 0.68763636]), 'split9_test_score': array([0.65024458, 0.70858688, 0.72751133, 0.85211688, 0.77777778,
0.66547406, 0.73179333, 0.72331544, 0.72394697, 0.80663616,
0.7309953 , 0.82618182, 0.76741064, 0.78417709, 0.79909622,
0.75447185, 0.79519774, 0.72227452, 0.72063321, 0.81577496,
0.80540648, 0.7352208 , 0.73509091, 0.79116223, 0.74089612,
0.72811594, 0.71818182, 0.75587774, 0.77883185, 0.74343149,
0.74554843, 0.78395395, 0.75524617, 0.78359993, 0.77 ,
0.67261905, 0.76019324, 0.74014137, 0.73584595, 0.81273837]), 'mean_test_score': array([0.72473179, 0.7298335 , 0.72574776, 0.74999393, 0.74818984,
0.72204196, 0.73474954, 0.72831437, 0.73301495, 0.73001626,
0.75927166, 0.76378081, 0.76389818, 0.75374727, 0.74372859,
0.75720835, 0.76916759, 0.7612809 , 0.75466799, 0.76949614,
0.74970276, 0.73642243, 0.77264623, 0.76003427, 0.76217237,
0.76736868, 0.75233181, 0.761618 , 0.76782605, 0.76147312,
0.75629681, 0.75982828, 0.7279367 , 0.76067728, 0.76138061,
0.75002725, 0.76497539, 0.76527854, 0.73648733, 0.74922545]), 'std_test_score': array([0.0529559 , 0.02692856, 0.02927373, 0.06535708, 0.03293425,
0.02677802, 0.04966651, 0.02406426, 0.04794827, 0.03359875,
0.03585324, 0.04637652, 0.04156315, 0.0433992 , 0.03932065,
0.03521985, 0.02122866, 0.02691546, 0.04091035, 0.03243326,
0.04993016, 0.03637849, 0.02572482, 0.02791494, 0.03393256,
0.03353195, 0.03037193, 0.02568565, 0.04247159, 0.0373298 ,
0.0391961 , 0.03997799, 0.03135058, 0.03203834, 0.03152763,
0.05228948, 0.02218081, 0.05362965, 0.0251646 , 0.0633458 ]), 'rank_test_score': array([39, 35, 38, 25, 28, 40, 32, 36, 33, 34, 18, 9, 8, 22, 29, 19, 3,
14, 21, 2, 26, 31, 1, 16, 10, 5, 23, 11, 4, 12, 20, 17, 37, 15,
13, 24, 7, 6, 30, 27], dtype=int32)}
# best mean cross-validation score (roc_auc) found by the Bayesian search
print(bayes_search.best_score_)
0.7726462267067131
# hyper-parameter combination that produced the best score
print(bayes_search.best_params_)
OrderedDict([('model__C', 0.09636808784098591), ('prep__non_numeric__encoder__transformer', OneHotEncoder()), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())])
# Map the verbose sklearn pipeline parameter names to short, readable column
# names for reporting. (Commented-out entries are placeholders for search runs
# that include a PCA step.)
new_param_column_names = {
    # 'pca__transformer': 'pca',
    # 'pca__n_components': 'pca: n_comps',
    # 'pca__gamma': 'pca: gamma',
    # 'pca__kernel': 'pca: kernel',
    'prep__non_numeric__encoder__transformer': 'encoder',
    'prep__numeric__imputer__transformer': 'imputer',
    'prep__numeric__scaler__transformer': 'scaler',
    'model__C': 'C',
}
# Wrap the BayesSearchCV object in an MLExperimentResults, persist it to YAML,
# then reload from YAML to confirm the round-trip preserves the results.
yaml_file = 'Run 1 - Logistic Regression - BayesSearchCV.yaml'
results = hlp.sklearn_eval.MLExperimentResults.from_sklearn_search_cv(
    searcher=bayes_search,
    higher_score_is_better=True,
    parameter_name_mappings=new_param_column_names,
)
results.to_yaml_file(yaml_file_name=yaml_file)
results = hlp.sklearn_eval.MLExperimentResults.from_yaml_file(yaml_file_name=yaml_file)
# best mean CV score — should match bayes_search.best_score_ above
results.best_primary_score
0.7726462267067131
# best hyper-parameters, using the short mapped names
results.best_primary_score_params
{'C': 0.09636808784098591,
'encoder': 'OneHotEncoder()',
'imputer': 'SimpleImputer()',
'scaler': 'StandardScaler()'}
# top 20 trials sorted by mean CV score, with 95% confidence intervals
results.to_formatted_dataframe(num_rows=20)
| roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | C | encoder | scaler |
|---|---|---|---|---|---|
| 0.773 | 0.754 | 0.791 | 0.096 | OneHotEncoder() | StandardScaler() |
| 0.769 | 0.746 | 0.793 | 0.134 | OneHotEncoder() | StandardScaler() |
| 0.769 | 0.754 | 0.784 | 0.162 | OneHotEncoder() | StandardScaler() |
| 0.768 | 0.737 | 0.798 | 0.091 | OneHotEncoder() | MinMaxScaler() |
| 0.767 | 0.743 | 0.791 | 0.019 | OneHotEncoder() | MinMaxScaler() |
| 0.765 | 0.727 | 0.804 | 0.010 | OneHotEncoder() | StandardScaler() |
| 0.765 | 0.749 | 0.781 | 0.010 | OneHotEncoder() | StandardScaler() |
| 0.764 | 0.734 | 0.794 | 0.012 | OneHotEncoder() | StandardScaler() |
| 0.764 | 0.731 | 0.797 | 0.010 | OneHotEncoder() | StandardScaler() |
| 0.762 | 0.738 | 0.786 | 42.853 | OneHotEncoder() | StandardScaler() |
| 0.762 | 0.743 | 0.780 | 0.010 | OneHotEncoder() | StandardScaler() |
| 0.761 | 0.735 | 0.788 | 0.041 | OneHotEncoder() | MinMaxScaler() |
| 0.761 | 0.739 | 0.784 | 0.010 | OneHotEncoder() | StandardScaler() |
| 0.761 | 0.742 | 0.781 | 99.964 | OneHotEncoder() | StandardScaler() |
| 0.761 | 0.738 | 0.784 | 0.010 | OneHotEncoder() | StandardScaler() |
| 0.760 | 0.740 | 0.780 | 0.010 | OneHotEncoder() | StandardScaler() |
| 0.760 | 0.731 | 0.788 | 0.010 | OneHotEncoder() | StandardScaler() |
| 0.759 | 0.734 | 0.785 | 0.108 | OneHotEncoder() | None |
| 0.757 | 0.732 | 0.782 | 99.936 | OneHotEncoder() | None |
| 0.756 | 0.728 | 0.784 | 42.974 | OneHotEncoder() | MinMaxScaler() |
# gives the score rank for each trial index
# e.g. array([4, 2, 1, 3])
# means the 1st trial (i.e. set of params) was the worst (rank 4)
# and the 3rd trial was the best (rank 1).
results.primary_score_trial_ranking
array([39, 35, 38, 25, 28, 40, 32, 36, 33, 34, 18, 9, 8, 22, 29, 19, 3,
14, 21, 2, 26, 31, 1, 16, 10, 5, 23, 11, 4, 12, 20, 17, 37, 15,
13, 24, 7, 6, 30, 27])
# gives the trial indexes ordered from best score to worst
# e.g. a primary_score_trial_ranking of array([4, 2, 1, 3])
# would return [2, 1, 3, 0] because index 2 (i.e. 3rd trial) was the best,
# so it comes first; and index 0 (i.e. 1st trial) was the worst, so it comes last
results.primary_score_best_indexes
array([22, 19, 16, 28, 25, 37, 36, 12, 11, 24, 27, 29, 34, 17, 33, 23, 31,
10, 15, 30, 18, 13, 26, 35, 3, 20, 39, 4, 14, 38, 21, 6, 8, 9,
1, 7, 32, 2, 0, 5])
# visualize the search: CV score across trials, optionally sized/colored by hyper-parameter
results.plot_performance_across_trials().show()
results.plot_performance_across_trials(size=None, color='C').show()
results.plot_performance_across_trials(size='C', color='scaler').show()
# how each hyper-parameter value varied over the course of the search
results.plot_parameter_values_across_trials().show()
results.plot_scatter_matrix(height=800, width=800 * hlp.plot.GOLDEN_RATIO).show()
results.plot_performance_numeric_params()
results.plot_parallel_coordinates().show()
results.plot_performance_non_numeric_params()
# score vs C, colored by scaler
results.plot_score_vs_parameter(
parameter='C',
color='scaler'
)
# score vs C, colored by encoder
results.plot_score_vs_parameter(
parameter='C',
color='encoder'
)
# Build a dataframe containing only the mean-score column and the
# hyper-parameter columns (drop CIs, ranks, etc.).
# (The original line had the cell output "roc_auc Mean" fused onto it by the
# notebook export; removed here.)
score_variable = results.primary_score_name + ' Mean'
score_dataframe = results.to_dataframe()
score_dataframe = score_dataframe.drop(columns=[x for x in score_dataframe.columns
                                                if x not in [score_variable] + results.parameter_names])
score_dataframe.head()
| roc_auc Mean | C | encoder | scaler | |
|---|---|---|---|---|
| 22 | 0.772646 | 0.096368 | OneHotEncoder() | StandardScaler() |
| 19 | 0.769496 | 0.133606 | OneHotEncoder() | StandardScaler() |
| 16 | 0.769168 | 0.161823 | OneHotEncoder() | StandardScaler() |
| 28 | 0.767826 | 0.091205 | OneHotEncoder() | MinMaxScaler() |
| 25 | 0.767369 | 0.018692 | OneHotEncoder() | MinMaxScaler() |
def _formula_safe(name):
    # keep only underscores and alphanumerics after replacing spaces,
    # so the name is usable in a statsmodels formula
    return ''.join(ch for ch in name.replace(' ', '_') if ch == '_' or ch.isalnum())

# original column name -> formula-safe column name
cleaned_column_names = {col: _formula_safe(col) for col in score_dataframe.columns.tolist()}
cleaned_column_names
{'roc_auc Mean': 'roc_auc_Mean',
'C': 'C',
'encoder': 'encoder',
'scaler': 'scaler'}
# rename columns to formula-safe (alphanumeric/underscore) names for statsmodels
score_dataframe = score_dataframe.rename(columns=cleaned_column_names)
import statsmodels.formula.api as smf
# regress the mean CV score on the hyper-parameters to see which ones drive performance
y_column = 'roc_auc_Mean'
X_columns = score_dataframe.columns.tolist()
X_columns.remove(y_column)
# join predictor names into 'C + encoder + scaler'
X_columns = hlp.string.collapse(X_columns, separate=" + ", surround="")
formula = f"{y_column} ~ {X_columns}"
print(formula)
model = smf.ols(formula=formula, data = score_dataframe)
# NOTE(review): this shadows the MLExperimentResults object also named `results`
# above; consider a distinct name (e.g. ols_results) to avoid confusion
results = model.fit()
print(results.summary())
roc_auc_Mean ~ C + encoder + scaler
OLS Regression Results
==============================================================================
Dep. Variable: roc_auc_Mean R-squared: 0.826
Model: OLS Adj. R-squared: 0.807
Method: Least Squares F-statistic: 41.65
Date: Mon, 31 Jan 2022 Prob (F-statistic): 7.61e-13
Time: 08:33:49 Log-Likelihood: 147.23
No. Observations: 40 AIC: -284.5
Df Residuals: 35 BIC: -276.0
Df Model: 4
Covariance Type: nonrobust
==============================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------
Intercept 0.7318 0.003 257.566 0.000 0.726 0.738
encoder[T.OneHotEncoder()] 0.0278 0.002 11.299 0.000 0.023 0.033
scaler[T.None] -0.0061 0.003 -1.963 0.058 -0.012 0.000
scaler[T.StandardScaler()] 0.0025 0.003 0.971 0.338 -0.003 0.008
C -5.014e-05 2.65e-05 -1.889 0.067 -0.000 3.75e-06
==============================================================================
Omnibus: 3.440 Durbin-Watson: 1.290
Prob(Omnibus): 0.179 Jarque-Bera (JB): 2.623
Skew: -0.624 Prob(JB): 0.269
Kurtosis: 3.133 Cond. No. 203.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import pandas as pd
# (removed an unused `scaler = StandardScaler()` assignment and its
# commented-out fit_transform call — dead code)
# split columns by type so only the numeric ones are standardized
numeric_columns = hlp.pandas.get_numeric_columns(score_dataframe)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(score_dataframe)
print(numeric_columns)
print(non_numeric_columns)
numeric_pipeline = Pipeline([
    ('scaling', StandardScaler()),
])
transformations_pipeline = ColumnTransformer([
    ('numeric_pipeline', numeric_pipeline, numeric_columns),
    ('non_numeric_pipeline', 'passthrough', non_numeric_columns)
])
# ColumnTransformer returns a numpy array and reorders columns:
# numeric (scaled) first, then the passthrough non-numeric columns
score_dataframe_transformed = transformations_pipeline.fit_transform(score_dataframe)
score_dataframe_transformed = pd.DataFrame(score_dataframe_transformed,
                                           columns=numeric_columns + non_numeric_columns)
score_dataframe_transformed.head()
['roc_auc_Mean', 'C'] ['encoder', 'scaler']
| roc_auc_Mean | C | encoder | scaler | |
|---|---|---|---|---|
| 0 | 1.467527 | -0.812032 | OneHotEncoder() | StandardScaler() |
| 1 | 1.252326 | -0.811111 | OneHotEncoder() | StandardScaler() |
| 2 | 1.229881 | -0.810413 | OneHotEncoder() | StandardScaler() |
| 3 | 1.138232 | -0.81216 | OneHotEncoder() | MinMaxScaler() |
| 4 | 1.106986 | -0.813953 | OneHotEncoder() | MinMaxScaler() |
# ColumnTransformer produced an object-dtype array; convert numeric columns back to float
score_dataframe_transformed['roc_auc_Mean'] = score_dataframe_transformed['roc_auc_Mean'].astype('float')
score_dataframe_transformed['C'] = score_dataframe_transformed['C'].astype('float')
print(formula)
# refit the same formula on standardized data so coefficient magnitudes are comparable
model = smf.ols(formula=formula,
data = score_dataframe_transformed)
results = model.fit()
print(results.summary())
roc_auc_Mean ~ C + encoder + scaler
OLS Regression Results
==============================================================================
Dep. Variable: roc_auc_Mean R-squared: 0.826
Model: OLS Adj. R-squared: 0.807
Method: Least Squares F-statistic: 41.65
Date: Mon, 31 Jan 2022 Prob (F-statistic): 7.61e-13
Time: 08:33:52 Log-Likelihood: -21.738
No. Observations: 40 AIC: 53.48
Df Residuals: 35 BIC: 61.92
Df Model: 4
Covariance Type: nonrobust
==============================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------
Intercept -1.4359 0.175 -8.203 0.000 -1.791 -1.081
encoder[T.OneHotEncoder()] 1.9014 0.168 11.299 0.000 1.560 2.243
scaler[T.None] -0.4188 0.213 -1.963 0.058 -0.852 0.014
scaler[T.StandardScaler()] 0.1702 0.175 0.971 0.338 -0.186 0.526
C -0.1385 0.073 -1.889 0.067 -0.287 0.010
==============================================================================
Omnibus: 3.440 Durbin-Watson: 1.290
Prob(Omnibus): 0.179 Jarque-Bera (JB): 2.623
Skew: -0.624 Prob(JB): 0.269
Kurtosis: 3.133 Cond. No. 5.46
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Tabulate the fitted coefficients with their p-values and flag the
# statistically significant terms (alpha = 0.05); drop the intercept.
coefficients = pd.DataFrame({
    'feature': results.params.index,
    'coefficient': results.params,
    'p_value': results.pvalues,
})
coefficients = coefficients.loc[coefficients['feature'] != 'Intercept']
coefficients['Stat Sig'] = coefficients['p_value'] <= 0.05
coefficients
| feature | coefficient | p_value | Stat Sig | |
|---|---|---|---|---|
| encoder[T.OneHotEncoder()] | encoder[T.OneHotEncoder()] | 1.901417 | 3.162368e-13 | True |
| scaler[T.None] | scaler[T.None] | -0.418772 | 5.758616e-02 | False |
| scaler[T.StandardScaler()] | scaler[T.StandardScaler()] | 0.170174 | 3.382277e-01 | False |
| C | C | -0.138519 | 6.719941e-02 | False |
# the (original, pre-cleaning) score column name, used in the plot title below
score_variable
'roc_auc Mean'
# horizontal bar chart of the coefficients, ordered by absolute magnitude,
# colored by statistical significance
px.bar(
data_frame=coefficients.reindex(coefficients['coefficient'].abs().sort_values(ascending=True).index),
y='feature',
x='coefficient',
color='Stat Sig',
title=f"Regression Coefficients of Hyper-parameters against '{score_variable}'",
height=600,
width=600*hlp.plot.GOLDEN_RATIO
)
from sklearn.inspection import permutation_importance
# (removed `forest = bayes_search.best_estimator_['model']` — it was never
# used, and the name was misleading: the model here is a LogisticRegression)
start_time = time.time()
# permute each raw feature through the full pipeline (preprocessing + model)
# so the importance reflects the end-to-end effect of scrambling that column
result = permutation_importance(
    bayes_search.best_estimator_, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2
)
elapsed_time = time.time() - start_time
print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")
feature_names = X_train.columns.to_list()
# mean importance per feature, highest first
forest_importances = pd.Series(result.importances_mean, index=feature_names)
forest_importances = forest_importances.sort_values(ascending=False)
Elapsed time to compute the importances: 2.049 seconds
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
# bar chart of mean permutation importances with std-dev error bars
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.set_size_inches(9, 6)
fig.tight_layout()
plt.show()
# default rate per foreign_worker group
temp = X_train.copy()
temp['default'] = y_train
# use the string 'mean' rather than np.mean — passing numpy callables to
# DataFrame.agg is deprecated in recent pandas versions; result is identical
temp.groupby('foreign_worker').agg({'default': 'mean'})
| default | |
|---|---|
| foreign_worker | |
| yes | 0.308290 |
| no | 0.107143 |
# box plot: age distribution split by default status — checks whether age
# visibly separates the two classes
fig = px.box(
data_frame=temp,
y='age',
x='default',
# size=size_variable,
# color=color_variable,
# trendline='lowess',
# labels={
# score_variable: f"Average Cross Validation Score ({parser.primary_score_name})",
# },
# title=f"<b>{x_variable}</b> - Performance<br>" \
# f"<sup>Size of point corresponds to '{size_variable}'</sup>",
# custom_data=['labels'],
height=600,
width=600*hlp.plot.GOLDEN_RATIO
)
fig.show()
NOTE: `foreign_worker` seems like it should be important (the default rates differ markedly between groups), yet it is ranked last in the permutation feature importance — worth investigating.